# Author: Stephen Situ
# Apache Spark is an open-source unified analytics engine for large-scale data processing. Using parallel computing, it allows us to
# process data at speeds needed for machine learning, AI, and big data. This is a practice project on using Pyspark and Spark SQL
# in Python. We get familiar with spark dataframes and try to see how they differ from pandas dataframes. Additionally, we practice
# using Spark SQL to run SQL queries on Spark dataframes.
# Import libraries
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
# Create spark session
spark = SparkSession.builder.appName('Test').getOrCreate()
# Check spark session is running
spark
SparkSession - in-memory
# Read CSV
df = spark.read.csv('netflix1.csv')
df.show()
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ | _c0| _c1| _c2| _c3| _c4| _c5| _c6| _c7| _c8| _c9| +-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in| | s1| Movie|Dick Johnson Is Dead| Kirsten Johnson| United States| 9/25/2021| 2020| PG-13| 90 min| Documentaries| | s3|TV Show| Ganglands| Julien Leclercq| France| 9/24/2021| 2021| TV-MA| 1 Season|Crime TV Shows, I...| | s6|TV Show| Midnight Mass| Mike Flanagan| United States| 9/24/2021| 2021| TV-MA| 1 Season|TV Dramas, TV Hor...| | s14| Movie|Confessions of an...| Bruno Garotti| Brazil| 9/22/2021| 2021| TV-PG| 91 min|Children & Family...| | s8| Movie| Sankofa| Haile Gerima| United States| 9/24/2021| 1993| TV-MA| 125 min|Dramas, Independe...| | s9|TV Show|The Great British...| Andy Devonshire|United Kingdom| 9/24/2021| 2021| TV-14|9 Seasons|British TV Shows,...| | s10| Movie| The Starling| Theodore Melfi| United States| 9/24/2021| 2021| PG-13| 104 min| Comedies, Dramas| | s939| Movie|Motu Patlu in the...| Suhas Kadav| India| 5/1/2021| 2019| TV-Y7| 87 min|Children & Family...| | s13| Movie| Je Suis Karl| Christian Schwochow| Germany| 9/23/2021| 2021| TV-MA| 127 min|Dramas, Internati...| | s940| Movie|Motu Patlu in Won...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 76 min|Children & Family...| | s941| Movie|Motu Patlu: Deep ...| Suhas Kadav| India| 5/1/2021| 2014| TV-Y7| 76 min|Children & Family...| | s942| Movie|Motu Patlu: Missi...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 71 min|Children & Family...| | s852| Movie| 99 Songs (Tamil)| Not Given| Pakistan| 5/21/2021| 2021| TV-14| 131 min|Dramas, Internati...| | s471| Movie|Bridgerton - The ...|Krysia Plonka, Kr...| United States| 7/13/2021| 2021| TV-14| 39 min| Movies| | s730| Movie|Bling Empire - Th...|Krysia Plonka, Kr...| United States| 6/12/2021| 2021| TV-MA| 36 min| Movies| | s731| Movie|Cobra Kai - The A...|Krysia Plonka, Kr...| United States| 6/12/2021| 2021| TV-MA| 34 min| Movies| | s913| Movie|The Circle - The ...|Krysia Plonka, Kr...| United States| 5/7/2021| 2021| TV-14| 35 min| Comedies| | s4|TV Show|Jailbirds New Orl...| Not Given| Pakistan| 9/24/2021| 2021| TV-MA| 1 Season|Docuseries, Reali...| | s15|TV Show|Crime Stories: In...| Not Given| Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|British TV Shows,...| +-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ only showing top 20 rows
# Read CSV with option header = True & inferSchema = True
df = spark.read.csv('netflix1.csv',header=True,inferSchema=True)
df.show()
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in| +-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ | s1| Movie|Dick Johnson Is Dead| Kirsten Johnson| United States| 9/25/2021| 2020| PG-13| 90 min| Documentaries| | s3|TV Show| Ganglands| Julien Leclercq| France| 9/24/2021| 2021| TV-MA| 1 Season|Crime TV Shows, I...| | s6|TV Show| Midnight Mass| Mike Flanagan| United States| 9/24/2021| 2021| TV-MA| 1 Season|TV Dramas, TV Hor...| | s14| Movie|Confessions of an...| Bruno Garotti| Brazil| 9/22/2021| 2021| TV-PG| 91 min|Children & Family...| | s8| Movie| Sankofa| Haile Gerima| United States| 9/24/2021| 1993| TV-MA| 125 min|Dramas, Independe...| | s9|TV Show|The Great British...| Andy Devonshire|United Kingdom| 9/24/2021| 2021| TV-14|9 Seasons|British TV Shows,...| | s10| Movie| The Starling| Theodore Melfi| United States| 9/24/2021| 2021| PG-13| 104 min| Comedies, Dramas| | s939| Movie|Motu Patlu in the...| Suhas Kadav| India| 5/1/2021| 2019| TV-Y7| 87 min|Children & Family...| | s13| Movie| Je Suis Karl| Christian Schwochow| Germany| 9/23/2021| 2021| TV-MA| 127 min|Dramas, Internati...| | s940| Movie|Motu Patlu in Won...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 76 min|Children & Family...| | s941| Movie|Motu Patlu: Deep ...| Suhas Kadav| India| 5/1/2021| 2014| TV-Y7| 76 min|Children & Family...| | s942| Movie|Motu Patlu: Missi...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 71 min|Children & Family...| | s852| Movie| 99 Songs (Tamil)| Not Given| Pakistan| 5/21/2021| 2021| TV-14| 131 min|Dramas, Internati...| | s471| Movie|Bridgerton - The ...|Krysia Plonka, Kr...| United States| 7/13/2021| 2021| TV-14| 39 min| Movies| | s730| Movie|Bling Empire - Th...|Krysia Plonka, Kr...| United States| 6/12/2021| 2021| TV-MA| 36 min| Movies| | s731| Movie|Cobra Kai - The A...|Krysia Plonka, Kr...| United States| 6/12/2021| 2021| TV-MA| 34 min| Movies| | s913| Movie|The Circle - The ...|Krysia Plonka, Kr...| United States| 5/7/2021| 2021| TV-14| 35 min| Comedies| | s4|TV Show|Jailbirds New Orl...| Not Given| Pakistan| 9/24/2021| 2021| TV-MA| 1 Season|Docuseries, Reali...| | s15|TV Show|Crime Stories: In...| Not Given| Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|British TV Shows,...| | s3232| Movie| True: Winter Wishes|Mark Thornton, To...| United States|11/26/2019| 2019| TV-Y| 46 min|Children & Family...| +-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ only showing top 20 rows
# Print Schema
df.printSchema()
root |-- show_id: string (nullable = true) |-- type: string (nullable = true) |-- title: string (nullable = true) |-- director: string (nullable = true) |-- country: string (nullable = true) |-- date_added: string (nullable = true) |-- release_year: string (nullable = true) |-- rating: string (nullable = true) |-- duration: string (nullable = true) |-- listed_in: string (nullable = true)
# Print Columns
df.columns
['show_id', 'type', 'title', 'director', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in']
# Print Head, top 5
df.head(5)
[Row(show_id='s1', type='Movie', title='Dick Johnson Is Dead', director='Kirsten Johnson', country='United States', date_added='9/25/2021', release_year='2020', rating='PG-13', duration='90 min', listed_in='Documentaries'), Row(show_id='s3', type='TV Show', title='Ganglands', director='Julien Leclercq', country='France', date_added='9/24/2021', release_year='2021', rating='TV-MA', duration='1 Season', listed_in='Crime TV Shows, International TV Shows, TV Action & Adventure'), Row(show_id='s6', type='TV Show', title='Midnight Mass', director='Mike Flanagan', country='United States', date_added='9/24/2021', release_year='2021', rating='TV-MA', duration='1 Season', listed_in='TV Dramas, TV Horror, TV Mysteries'), Row(show_id='s14', type='Movie', title='Confessions of an Invisible Girl', director='Bruno Garotti', country='Brazil', date_added='9/22/2021', release_year='2021', rating='TV-PG', duration='91 min', listed_in='Children & Family Movies, Comedies'), Row(show_id='s8', type='Movie', title='Sankofa', director='Haile Gerima', country='United States', date_added='9/24/2021', release_year='1993', rating='TV-MA', duration='125 min', listed_in='Dramas, Independent Movies, International Movies')]
# Print Data types
df.dtypes
[('show_id', 'string'), ('type', 'string'), ('title', 'string'), ('director', 'string'), ('country', 'string'), ('date_added', 'string'), ('release_year', 'string'), ('rating', 'string'), ('duration', 'string'), ('listed_in', 'string')]
# describe
df.describe().show()
+-------+----------------+-------------+---------------------------------+------------+--------+----------+------------------+--------------------+--------+------------------+ |summary| show_id| type| title| director| country|date_added| release_year| rating|duration| listed_in| +-------+----------------+-------------+---------------------------------+------------+--------+----------+------------------+--------------------+--------+------------------+ | count| 8791| 8791| 8791| 8790| 8790| 8790| 8790| 8790| 8789| 8789| | mean| null| null| 1124.7692307692307| null| 1944.0| null|2014.1911480259416| null| null| null| | stddev| null| null| 1042.1800991068478| null| null| null| 8.79415428974682| null| null| null| | min|Flying Fortress"| Movie| #Alive| 3/31/2017| 1944| 1/1/2008| 1925|Classic Movies, D...|1 Season|Action & Adventure| | max| s999|William Wyler|최강전사 미니특공대 : 영웅의 탄생|Şenol Sönmez|Zimbabwe| TV-PG| 40 min| UR| 99 min| Thrillers| +-------+----------------+-------------+---------------------------------+------------+--------+----------+------------------+--------------------+--------+------------------+
# Selecting a column "type"
df.select('type').show()
+-------+ | type| +-------+ | Movie| |TV Show| |TV Show| | Movie| | Movie| |TV Show| | Movie| | Movie| | Movie| | Movie| | Movie| | Movie| | Movie| | Movie| | Movie| | Movie| | Movie| |TV Show| |TV Show| | Movie| +-------+ only showing top 20 rows
# Selecting Multiple columns
df.select(['type','country']).show()
+-------+--------------+ | type| country| +-------+--------------+ | Movie| United States| |TV Show| France| |TV Show| United States| | Movie| Brazil| | Movie| United States| |TV Show|United Kingdom| | Movie| United States| | Movie| India| | Movie| Germany| | Movie| India| | Movie| India| | Movie| India| | Movie| Pakistan| | Movie| United States| | Movie| United States| | Movie| United States| | Movie| United States| |TV Show| Pakistan| |TV Show| Pakistan| | Movie| United States| +-------+--------------+ only showing top 20 rows
# Adding a column "New Year"
test = df.withColumn('New Year',df['release_year']+1)
test.select(['New Year','release_year']).show()
+--------+------------+ |New Year|release_year| +--------+------------+ | 2021.0| 2020| | 2022.0| 2021| | 2022.0| 2021| | 2022.0| 2021| | 1994.0| 1993| | 2022.0| 2021| | 2022.0| 2021| | 2020.0| 2019| | 2022.0| 2021| | 2014.0| 2013| | 2015.0| 2014| | 2014.0| 2013| | 2022.0| 2021| | 2022.0| 2021| | 2022.0| 2021| | 2022.0| 2021| | 2022.0| 2021| | 2022.0| 2021| | 2022.0| 2021| | 2020.0| 2019| +--------+------------+ only showing top 20 rows
# Renaming "New Year" column to "Great Year"
test1 = test.withColumnRenamed('New Year','Great Year')
test1.columns
['show_id', 'type', 'title', 'director', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'Great Year']
# Dropping a column
test1 = test1.drop('Great Year')
test1.columns
['show_id', 'type', 'title', 'director', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in']
# Drop all N/A's
df.na.drop().show()
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in| +-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ | s1| Movie|Dick Johnson Is Dead| Kirsten Johnson| United States| 9/25/2021| 2020| PG-13| 90 min| Documentaries| | s3|TV Show| Ganglands| Julien Leclercq| France| 9/24/2021| 2021| TV-MA| 1 Season|Crime TV Shows, I...| | s6|TV Show| Midnight Mass| Mike Flanagan| United States| 9/24/2021| 2021| TV-MA| 1 Season|TV Dramas, TV Hor...| | s14| Movie|Confessions of an...| Bruno Garotti| Brazil| 9/22/2021| 2021| TV-PG| 91 min|Children & Family...| | s8| Movie| Sankofa| Haile Gerima| United States| 9/24/2021| 1993| TV-MA| 125 min|Dramas, Independe...| | s9|TV Show|The Great British...| Andy Devonshire|United Kingdom| 9/24/2021| 2021| TV-14|9 Seasons|British TV Shows,...| | s10| Movie| The Starling| Theodore Melfi| United States| 9/24/2021| 2021| PG-13| 104 min| Comedies, Dramas| | s939| Movie|Motu Patlu in the...| Suhas Kadav| India| 5/1/2021| 2019| TV-Y7| 87 min|Children & Family...| | s13| Movie| Je Suis Karl| Christian Schwochow| Germany| 9/23/2021| 2021| TV-MA| 127 min|Dramas, Internati...| | s940| Movie|Motu Patlu in Won...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 76 min|Children & Family...| | s941| Movie|Motu Patlu: Deep ...| Suhas Kadav| India| 5/1/2021| 2014| TV-Y7| 76 min|Children & Family...| | s942| Movie|Motu Patlu: Missi...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 71 min|Children & Family...| | s852| Movie| 99 Songs (Tamil)| Not Given| Pakistan| 5/21/2021| 2021| TV-14| 131 min|Dramas, Internati...| | s471| Movie|Bridgerton - The ...|Krysia Plonka, Kr...| United States| 7/13/2021| 2021| TV-14| 39 min| Movies| | s730| Movie|Bling Empire - Th...|Krysia Plonka, Kr...| United States| 6/12/2021| 2021| TV-MA| 36 min| Movies| | s731| Movie|Cobra Kai - The A...|Krysia Plonka, Kr...| United States| 6/12/2021| 2021| TV-MA| 34 min| Movies| | s913| Movie|The Circle - The ...|Krysia Plonka, Kr...| United States| 5/7/2021| 2021| TV-14| 35 min| Comedies| | s4|TV Show|Jailbirds New Orl...| Not Given| Pakistan| 9/24/2021| 2021| TV-MA| 1 Season|Docuseries, Reali...| | s15|TV Show|Crime Stories: In...| Not Given| Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|British TV Shows,...| | s3232| Movie| True: Winter Wishes|Mark Thornton, To...| United States|11/26/2019| 2019| TV-Y| 46 min|Children & Family...| +-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+ only showing top 20 rows
# Other Drop Variations
# drop any N/A's
# df.na.drop(how="any").show()
# drop only if all values are null
# df.na.drop(how="all").show()
# drop if 2 or more N/A's
# df.na.drop(how="any",thresh=2).show()
# drop N/A's in certain columns
# df.na.drop(how="any",subset=['director']).show()
# Filling N/A values
# df.na.fill('Missing Values').show()
# df.na.fill('Missing Values',['A','B']).show()
# Use from pyspark.ml.feature import Imputer to do imputation
# Filtering functions
df.filter(df['country']=="Pakistan").show()
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in| +-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+ | s852| Movie| 99 Songs (Tamil)|Not Given|Pakistan| 5/21/2021| 2021| TV-14| 131 min|Dramas, Internati...| | s4|TV Show|Jailbirds New Orl...|Not Given|Pakistan| 9/24/2021| 2021| TV-MA| 1 Season|Docuseries, Reali...| | s15|TV Show|Crime Stories: In...|Not Given|Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|British TV Shows,...| | s20|TV Show| Jaguar|Not Given|Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|International TV ...| | s32|TV Show| Chicago Party Aunt|Not Given|Pakistan| 9/17/2021| 2021| TV-MA| 1 Season| TV Comedies| | s34|TV Show| Squid Game|Not Given|Pakistan| 9/17/2021| 2021| TV-MA| 1 Season|International TV ...| | s35|TV Show|Tayo and Little W...|Not Given|Pakistan| 9/17/2021| 2020| TV-Y7| 1 Season| Kids' TV| | s75|TV Show|The World's Most ...|Not Given|Pakistan| 9/14/2021| 2021| TV-PG|2 Seasons| Reality TV| | s84|TV Show| Metal Shop Masters|Not Given|Pakistan| 9/10/2021| 2021| TV-MA| 1 Season| Reality TV| | s86|TV Show|Pokémon Master Jo...|Not Given|Pakistan| 9/10/2021| 2021| TV-Y7| 1 Season|Anime Series, Kid...| | s88|TV Show| Titipo Titipo|Not Given|Pakistan| 9/10/2021| 2019| TV-Y|2 Seasons|Kids' TV, Korean ...| | s90|TV Show| Mighty Raju|Not Given|Pakistan| 9/9/2021| 2017| TV-Y7|4 Seasons| Kids' TV| | s101|TV Show|Tobot Galaxy Dete...|Not Given|Pakistan| 9/7/2021| 2019| TV-Y7|2 Seasons| Kids' TV| | s122|TV Show| Hotel Del Luna|Not Given|Pakistan| 9/2/2021| 2019| TV-14| 1 Season|International TV ...| | s133|TV Show|Brave Animated Se...|Not Given|Pakistan| 9/1/2021| 2021| TV-MA| 1 Season|International TV ...| | s148|TV Show| How to Be a Cowboy|Not Given|Pakistan| 9/1/2021| 2021| TV-PG| 1 Season| Reality TV| | s166|TV Show| Oldsters|Not Given|Pakistan| 9/1/2021| 2019| TV-MA| 1 Season|Crime TV Shows, I...| | s190|TV Show| Bread Barbershop|Not Given|Pakistan| 8/28/2021| 2020| TV-Y|2 Seasons|Kids' TV, TV Come...| | s182|TV Show|Turning Point: 9/...|Not Given|Pakistan| 9/1/2021| 2021| TV-14| 1 Season| Docuseries| | s187|TV Show|Hometown Cha-Cha-Cha|Not Given|Pakistan| 8/29/2021| 2021| TV-14| 1 Season|International TV ...| +-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+ only showing top 20 rows
# Using 2 conditions
df.filter((df['country']=="Pakistan") & (df['type']=="TV Show")).show()
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in| +-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+ | s4|TV Show|Jailbirds New Orl...|Not Given|Pakistan| 9/24/2021| 2021| TV-MA| 1 Season|Docuseries, Reali...| | s15|TV Show|Crime Stories: In...|Not Given|Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|British TV Shows,...| | s20|TV Show| Jaguar|Not Given|Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|International TV ...| | s32|TV Show| Chicago Party Aunt|Not Given|Pakistan| 9/17/2021| 2021| TV-MA| 1 Season| TV Comedies| | s34|TV Show| Squid Game|Not Given|Pakistan| 9/17/2021| 2021| TV-MA| 1 Season|International TV ...| | s35|TV Show|Tayo and Little W...|Not Given|Pakistan| 9/17/2021| 2020| TV-Y7| 1 Season| Kids' TV| | s75|TV Show|The World's Most ...|Not Given|Pakistan| 9/14/2021| 2021| TV-PG|2 Seasons| Reality TV| | s84|TV Show| Metal Shop Masters|Not Given|Pakistan| 9/10/2021| 2021| TV-MA| 1 Season| Reality TV| | s86|TV Show|Pokémon Master Jo...|Not Given|Pakistan| 9/10/2021| 2021| TV-Y7| 1 Season|Anime Series, Kid...| | s88|TV Show| Titipo Titipo|Not Given|Pakistan| 9/10/2021| 2019| TV-Y|2 Seasons|Kids' TV, Korean ...| | s90|TV Show| Mighty Raju|Not Given|Pakistan| 9/9/2021| 2017| TV-Y7|4 Seasons| Kids' TV| | s101|TV Show|Tobot Galaxy Dete...|Not Given|Pakistan| 9/7/2021| 2019| TV-Y7|2 Seasons| Kids' TV| | s122|TV Show| Hotel Del Luna|Not Given|Pakistan| 9/2/2021| 2019| TV-14| 1 Season|International TV ...| | s133|TV Show|Brave Animated Se...|Not Given|Pakistan| 9/1/2021| 2021| TV-MA| 1 Season|International TV ...| | s148|TV Show| How to Be a Cowboy|Not Given|Pakistan| 9/1/2021| 2021| TV-PG| 1 Season| Reality TV| | s166|TV Show| Oldsters|Not Given|Pakistan| 9/1/2021| 2019| TV-MA| 1 Season|Crime TV Shows, I...| | s190|TV Show| Bread Barbershop|Not Given|Pakistan| 8/28/2021| 2020| TV-Y|2 Seasons|Kids' TV, TV Come...| | s182|TV Show|Turning Point: 9/...|Not Given|Pakistan| 9/1/2021| 2021| TV-14| 1 Season| Docuseries| | s187|TV Show|Hometown Cha-Cha-Cha|Not Given|Pakistan| 8/29/2021| 2021| TV-14| 1 Season|International TV ...| | s188|TV Show|The Ingenuity of ...|Not Given|Pakistan| 8/29/2021| 2021| TV-G| 1 Season|International TV ...| +-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+ only showing top 20 rows
# Use numeric filter
df.filter(df['release_year']<=2019).show()
+-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in| +-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+ | s8| Movie| Sankofa| Haile Gerima|United States| 9/24/2021| 1993| TV-MA| 125 min|Dramas, Independe...| | s939| Movie|Motu Patlu in the...| Suhas Kadav| India| 5/1/2021| 2019| TV-Y7| 87 min|Children & Family...| | s940| Movie|Motu Patlu in Won...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 76 min|Children & Family...| | s941| Movie|Motu Patlu: Deep ...| Suhas Kadav| India| 5/1/2021| 2014| TV-Y7| 76 min|Children & Family...| | s942| Movie|Motu Patlu: Missi...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 71 min|Children & Family...| | s3232| Movie| True: Winter Wishes|Mark Thornton, To...|United States|11/26/2019| 2019| TV-Y| 46 min|Children & Family...| | s4832|TV Show|True: Magical Fri...|Mark Thornton, To...|United States| 6/15/2018| 2018| TV-Y| 1 Season| Kids' TV| | s4833|TV Show|True: Wonderful W...|Mark Thornton, To...|United States| 6/15/2018| 2018| TV-Y| 1 Season| Kids' TV| | s4857|TV Show|Dance & Sing with...|Mark Thornton, To...|United States| 5/18/2018| 2018| TV-Y| 1 Season| Kids' TV| | s7930| Movie| Samudri Lootere| Anirban Majumder| Not Given| 6/18/2019| 2018| TV-Y| 65 min|Children & Family...| | s25| Movie| Jeans| S. Shankar| India| 9/21/2021| 1998| TV-14| 166 min|Comedies, Interna...| | s28| Movie| Grown Ups| Dennis Dugan|United States| 9/20/2021| 2010| PG-13| 103 min| Comedies| | s29| Movie| Dark Skies| Scott Stewart|United States| 9/19/2021| 2013| PG-13| 97 min|Horror Movies, Sc...| | s30| Movie| Paranoia| Robert Luketic|United States| 9/19/2021| 2013| PG-13| 106 min| Thrillers| | s88|TV Show| Titipo Titipo| Not Given| Pakistan| 9/10/2021| 2019| TV-Y|2 Seasons|Kids' TV, Korean ...| | s90|TV Show| Mighty Raju| Not Given| Pakistan| 9/9/2021| 2017| TV-Y7|4 Seasons| Kids' TV| | s101|TV Show|Tobot Galaxy Dete...| Not Given| Pakistan| 9/7/2021| 2019| TV-Y7|2 Seasons| Kids' TV| | s122|TV Show| Hotel Del Luna| Not Given| Pakistan| 9/2/2021| 2019| TV-14| 1 Season|International TV ...| | s166|TV Show| Oldsters| Not Given| Pakistan| 9/1/2021| 2019| TV-MA| 1 Season|Crime TV Shows, I...| | s27| Movie| Minsara Kanavu| Rajiv Menon| India| 9/21/2021| 1997| TV-PG| 147 min|Comedies, Interna...| +-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+ only showing top 20 rows
# Filtering and Selecting
df.filter(df['release_year']<=2019).select(['title','country']).show()
+--------------------+-------------+ | title| country| +--------------------+-------------+ | Sankofa|United States| |Motu Patlu in the...| India| |Motu Patlu in Won...| India| |Motu Patlu: Deep ...| India| |Motu Patlu: Missi...| India| | True: Winter Wishes|United States| |True: Magical Fri...|United States| |True: Wonderful W...|United States| |Dance & Sing with...|United States| | Samudri Lootere| Not Given| | Jeans| India| | Grown Ups|United States| | Dark Skies|United States| | Paranoia|United States| | Titipo Titipo| Pakistan| | Mighty Raju| Pakistan| |Tobot Galaxy Dete...| Pakistan| | Hotel Del Luna| Pakistan| | Oldsters| Pakistan| | Minsara Kanavu| India| +--------------------+-------------+ only showing top 20 rows
# Using the not "~" filter
df.filter(~(df['release_year']==2021)).show()
+-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in| +-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+ | s1| Movie|Dick Johnson Is Dead| Kirsten Johnson|United States| 9/25/2021| 2020| PG-13| 90 min| Documentaries| | s8| Movie| Sankofa| Haile Gerima|United States| 9/24/2021| 1993| TV-MA| 125 min|Dramas, Independe...| | s939| Movie|Motu Patlu in the...| Suhas Kadav| India| 5/1/2021| 2019| TV-Y7| 87 min|Children & Family...| | s940| Movie|Motu Patlu in Won...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 76 min|Children & Family...| | s941| Movie|Motu Patlu: Deep ...| Suhas Kadav| India| 5/1/2021| 2014| TV-Y7| 76 min|Children & Family...| | s942| Movie|Motu Patlu: Missi...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 71 min|Children & Family...| | s3232| Movie| True: Winter Wishes|Mark Thornton, To...|United States|11/26/2019| 2019| TV-Y| 46 min|Children & Family...| | s4832|TV Show|True: Magical Fri...|Mark Thornton, To...|United States| 6/15/2018| 2018| TV-Y| 1 Season| Kids' TV| | s4833|TV Show|True: Wonderful W...|Mark Thornton, To...|United States| 6/15/2018| 2018| TV-Y| 1 Season| Kids' TV| | s4857|TV Show|Dance & Sing with...|Mark Thornton, To...|United States| 5/18/2018| 2018| TV-Y| 1 Season| Kids' TV| | s17| Movie|Europe's Most Dan...|Pedro de Echave G...| Not Given| 9/22/2021| 2020| TV-MA| 67 min|Documentaries, In...| | s7930| Movie| Samudri Lootere| Anirban Majumder| Not Given| 6/18/2019| 2018| TV-Y| 65 min|Children & Family...| | s25| Movie| Jeans| S. Shankar| India| 9/21/2021| 1998| TV-14| 166 min|Comedies, Interna...| | s28| Movie| Grown Ups| Dennis Dugan|United States| 9/20/2021| 2010| PG-13| 103 min| Comedies| | s29| Movie| Dark Skies| Scott Stewart|United States| 9/19/2021| 2013| PG-13| 97 min|Horror Movies, Sc...| | s30| Movie| Paranoia| Robert Luketic|United States| 9/19/2021| 2013| PG-13| 106 min| Thrillers| | s35|TV Show|Tayo and Little W...| Not Given| Pakistan| 9/17/2021| 2020| TV-Y7| 1 Season| Kids' TV| | s88|TV Show| Titipo Titipo| Not Given| Pakistan| 9/10/2021| 2019| TV-Y|2 Seasons|Kids' TV, Korean ...| | s90|TV Show| Mighty Raju| Not Given| Pakistan| 9/9/2021| 2017| TV-Y7|4 Seasons| Kids' TV| | s101|TV Show|Tobot Galaxy Dete...| Not Given| Pakistan| 9/7/2021| 2019| TV-Y7|2 Seasons| Kids' TV| +-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+ only showing top 20 rows
# Cast release_year column to numeric
from pyspark.sql.types import DecimalType
df1 = df.withColumn("New_year", df["release_year"].cast(DecimalType(precision=12, scale=2)))
df1.dtypes
[('show_id', 'string'), ('type', 'string'), ('title', 'string'), ('director', 'string'), ('country', 'string'), ('date_added', 'string'), ('release_year', 'string'), ('rating', 'string'), ('duration', 'string'), ('listed_in', 'string'), ('New_year', 'decimal(12,2)')]
# Using Group By and Aggragate functions
# Use Sum
df1.groupBy('type').sum().show()
+-------------+-------------+ | type|sum(New_year)| +-------------+-------------+ | TV Show| 5372296.00| | Movie| 12330430.00| |William Wyler| null| +-------------+-------------+
# # Other functions mean(),count(),max(),avg()
df1.groupby('type').mean().show()
+-------------+-------------+ | type|avg(New_year)| +-------------+-------------+ | TV Show| 2016.627628| | Movie| 2013.131429| |William Wyler| null| +-------------+-------------+
# Use dictionary to apply directly on a column
df1.agg({'new_year':'sum'}).show()
+-------------+ |sum(new_year)| +-------------+ | 17702726.00| +-------------+
# To use Spark SQL, create a temp view
df1.createOrReplaceTempView("df1")
# Now we can use SQL queries like normal
spark.sql("SELECT * FROM df1").show()
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+--------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in|New_year| +-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+--------+ | s1| Movie|Dick Johnson Is Dead| Kirsten Johnson| United States| 9/25/2021| 2020| PG-13| 90 min| Documentaries| 2020.00| | s3|TV Show| Ganglands| Julien Leclercq| France| 9/24/2021| 2021| TV-MA| 1 Season|Crime TV Shows, I...| 2021.00| | s6|TV Show| Midnight Mass| Mike Flanagan| United States| 9/24/2021| 2021| TV-MA| 1 Season|TV Dramas, TV Hor...| 2021.00| | s14| Movie|Confessions of an...| Bruno Garotti| Brazil| 9/22/2021| 2021| TV-PG| 91 min|Children & Family...| 2021.00| | s8| Movie| Sankofa| Haile Gerima| United States| 9/24/2021| 1993| TV-MA| 125 min|Dramas, Independe...| 1993.00| | s9|TV Show|The Great British...| Andy Devonshire|United Kingdom| 9/24/2021| 2021| TV-14|9 Seasons|British TV Shows,...| 2021.00| | s10| Movie| The Starling| Theodore Melfi| United States| 9/24/2021| 2021| PG-13| 104 min| Comedies, Dramas| 2021.00| | s939| Movie|Motu Patlu in the...| Suhas Kadav| India| 5/1/2021| 2019| TV-Y7| 87 min|Children & Family...| 2019.00| | s13| Movie| Je Suis Karl| Christian Schwochow| Germany| 9/23/2021| 2021| TV-MA| 127 min|Dramas, Internati...| 2021.00| | s940| Movie|Motu Patlu in Won...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 76 min|Children & Family...| 2013.00| | s941| Movie|Motu Patlu: Deep ...| Suhas Kadav| India| 5/1/2021| 2014| TV-Y7| 76 min|Children & Family...| 2014.00| | s942| Movie|Motu Patlu: Missi...| Suhas Kadav| India| 5/1/2021| 2013| TV-Y7| 71 min|Children & Family...| 2013.00| | s852| Movie| 99 Songs (Tamil)| Not Given| Pakistan| 5/21/2021| 2021| TV-14| 131 min|Dramas, Internati...| 2021.00| | s471| Movie|Bridgerton - The ...|Krysia Plonka, Kr...| United States| 7/13/2021| 2021| TV-14| 39 min| Movies| 2021.00| | s730| Movie|Bling Empire - Th...|Krysia Plonka, Kr...| United States| 6/12/2021| 2021| TV-MA| 36 min| Movies| 2021.00| | s731| Movie|Cobra Kai - The A...|Krysia Plonka, Kr...| United States| 6/12/2021| 2021| TV-MA| 34 min| Movies| 2021.00| | s913| Movie|The Circle - The ...|Krysia Plonka, Kr...| United States| 5/7/2021| 2021| TV-14| 35 min| Comedies| 2021.00| | s4|TV Show|Jailbirds New Orl...| Not Given| Pakistan| 9/24/2021| 2021| TV-MA| 1 Season|Docuseries, Reali...| 2021.00| | s15|TV Show|Crime Stories: In...| Not Given| Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|British TV Shows,...| 2021.00| | s3232| Movie| True: Winter Wishes|Mark Thornton, To...| United States|11/26/2019| 2019| TV-Y| 46 min|Children & Family...| 2019.00| +-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+--------+ only showing top 20 rows
spark.sql("SELECT * FROM df1 WHERE country='Pakistan'").show()
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in|New_year| +-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+ | s852| Movie| 99 Songs (Tamil)|Not Given|Pakistan| 5/21/2021| 2021| TV-14| 131 min|Dramas, Internati...| 2021.00| | s4|TV Show|Jailbirds New Orl...|Not Given|Pakistan| 9/24/2021| 2021| TV-MA| 1 Season|Docuseries, Reali...| 2021.00| | s15|TV Show|Crime Stories: In...|Not Given|Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|British TV Shows,...| 2021.00| | s20|TV Show| Jaguar|Not Given|Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|International TV ...| 2021.00| | s32|TV Show| Chicago Party Aunt|Not Given|Pakistan| 9/17/2021| 2021| TV-MA| 1 Season| TV Comedies| 2021.00| | s34|TV Show| Squid Game|Not Given|Pakistan| 9/17/2021| 2021| TV-MA| 1 Season|International TV ...| 2021.00| | s35|TV Show|Tayo and Little W...|Not Given|Pakistan| 9/17/2021| 2020| TV-Y7| 1 Season| Kids' TV| 2020.00| | s75|TV Show|The World's Most ...|Not Given|Pakistan| 9/14/2021| 2021| TV-PG|2 Seasons| Reality TV| 2021.00| | s84|TV Show| Metal Shop Masters|Not Given|Pakistan| 9/10/2021| 2021| TV-MA| 1 Season| Reality TV| 2021.00| | s86|TV Show|Pokémon Master Jo...|Not Given|Pakistan| 9/10/2021| 2021| TV-Y7| 1 Season|Anime Series, Kid...| 2021.00| | s88|TV Show| Titipo Titipo|Not Given|Pakistan| 9/10/2021| 2019| TV-Y|2 Seasons|Kids' TV, Korean ...| 2019.00| | s90|TV Show| Mighty Raju|Not Given|Pakistan| 9/9/2021| 2017| TV-Y7|4 Seasons| Kids' TV| 2017.00| | s101|TV Show|Tobot Galaxy Dete...|Not Given|Pakistan| 9/7/2021| 2019| TV-Y7|2 Seasons| Kids' TV| 2019.00| | s122|TV Show| Hotel Del Luna|Not Given|Pakistan| 9/2/2021| 2019| TV-14| 1 Season|International TV ...| 2019.00| | s133|TV Show|Brave Animated Se...|Not Given|Pakistan| 9/1/2021| 2021| TV-MA| 1 Season|International TV ...| 2021.00| | s148|TV Show| How to Be a Cowboy|Not Given|Pakistan| 9/1/2021| 2021| TV-PG| 1 Season| Reality TV| 2021.00| | s166|TV Show| Oldsters|Not Given|Pakistan| 9/1/2021| 2019| TV-MA| 1 Season|Crime TV Shows, I...| 2019.00| | s190|TV Show| Bread Barbershop|Not Given|Pakistan| 8/28/2021| 2020| TV-Y|2 Seasons|Kids' TV, TV Come...| 2020.00| | s182|TV Show|Turning Point: 9/...|Not Given|Pakistan| 9/1/2021| 2021| TV-14| 1 Season| Docuseries| 2021.00| | s187|TV Show|Hometown Cha-Cha-Cha|Not Given|Pakistan| 8/29/2021| 2021| TV-14| 1 Season|International TV ...| 2021.00| +-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+ only showing top 20 rows
# Assign new spark dataframe from sql query
pakistan_data = spark.sql("SELECT * FROM df1 WHERE country='Pakistan'")
pakistan_data.show()
type(pakistan_data)
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+ |show_id| type| title| director| country|date_added|release_year|rating| duration| listed_in|New_year| +-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+ | s852| Movie| 99 Songs (Tamil)|Not Given|Pakistan| 5/21/2021| 2021| TV-14| 131 min|Dramas, Internati...| 2021.00| | s4|TV Show|Jailbirds New Orl...|Not Given|Pakistan| 9/24/2021| 2021| TV-MA| 1 Season|Docuseries, Reali...| 2021.00| | s15|TV Show|Crime Stories: In...|Not Given|Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|British TV Shows,...| 2021.00| | s20|TV Show| Jaguar|Not Given|Pakistan| 9/22/2021| 2021| TV-MA| 1 Season|International TV ...| 2021.00| | s32|TV Show| Chicago Party Aunt|Not Given|Pakistan| 9/17/2021| 2021| TV-MA| 1 Season| TV Comedies| 2021.00| | s34|TV Show| Squid Game|Not Given|Pakistan| 9/17/2021| 2021| TV-MA| 1 Season|International TV ...| 2021.00| | s35|TV Show|Tayo and Little W...|Not Given|Pakistan| 9/17/2021| 2020| TV-Y7| 1 Season| Kids' TV| 2020.00| | s75|TV Show|The World's Most ...|Not Given|Pakistan| 9/14/2021| 2021| TV-PG|2 Seasons| Reality TV| 2021.00| | s84|TV Show| Metal Shop Masters|Not Given|Pakistan| 9/10/2021| 2021| TV-MA| 1 Season| Reality TV| 2021.00| | s86|TV Show|Pokémon Master Jo...|Not Given|Pakistan| 9/10/2021| 2021| TV-Y7| 1 Season|Anime Series, Kid...| 2021.00| | s88|TV Show| Titipo Titipo|Not Given|Pakistan| 9/10/2021| 2019| TV-Y|2 Seasons|Kids' TV, Korean ...| 2019.00| | s90|TV Show| Mighty Raju|Not Given|Pakistan| 9/9/2021| 2017| TV-Y7|4 Seasons| Kids' TV| 2017.00| | s101|TV Show|Tobot Galaxy Dete...|Not Given|Pakistan| 9/7/2021| 2019| TV-Y7|2 Seasons| Kids' TV| 2019.00| | s122|TV Show| Hotel Del Luna|Not Given|Pakistan| 9/2/2021| 2019| TV-14| 1 Season|International TV ...| 2019.00| | s133|TV Show|Brave Animated Se...|Not Given|Pakistan| 9/1/2021| 2021| TV-MA| 1 Season|International TV ...| 2021.00| | s148|TV Show| How to Be a Cowboy|Not Given|Pakistan| 9/1/2021| 2021| TV-PG| 1 Season| Reality TV| 2021.00| | s166|TV Show| Oldsters|Not Given|Pakistan| 9/1/2021| 2019| TV-MA| 1 Season|Crime TV Shows, I...| 2019.00| | s190|TV Show| Bread Barbershop|Not Given|Pakistan| 8/28/2021| 2020| TV-Y|2 Seasons|Kids' TV, TV Come...| 2020.00| | s182|TV Show|Turning Point: 9/...|Not Given|Pakistan| 9/1/2021| 2021| TV-14| 1 Season| Docuseries| 2021.00| | s187|TV Show|Hometown Cha-Cha-Cha|Not Given|Pakistan| 8/29/2021| 2021| TV-14| 1 Season|International TV ...| 2021.00| +-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+ only showing top 20 rows
pyspark.sql.dataframe.DataFrame